Read data
Get list of samples
Code
samples <- read_tsv ("config/samples.tsv" ,
col_types = list (
sample_name = col_character (),
cell_line = col_factor (),
exogenous_rna = col_factor (),
day = col_factor ()
)
)
units <- read_tsv ("config/units.tsv" ,
col_types = list (
sample_name = col_character (),
unit_name = col_character (),
fq1 = col_character (),
fq2 = col_character ()
)
)
sample_units <- dplyr:: left_join (samples, units, by = "sample_name" ) %>%
unite (sample_unit, sample_name, unit_name, remove = FALSE )
sample_units
Read Samtools idxstats to get human coverage for normalization
Notes:
The counts include the total number of reads aligned, they are not limited to uniquely aligned reads.
The counts are reads, not pairs or fragments
Code
idxstats_exogenousrna_dir <-
"results/samtools_idxstats/exogenous_rna/"
idxstats_human_dir <-
"results/samtools_idxstats/Homo_sapiens.GRCh38.dna.primary_assembly/"
bowtie2_human_logs <-
"results/logs/bowtie2/Homo_sapiens.GRCh38.dna.primary_assembly/"
idxstats <- tibble ()
for (row in seq_len (nrow (sample_units))) {
sample <- sample_units[row, ]$ sample_unit
# Read `idsxstats` for exogenous mapped reads
exogenous_rna_stats <- read_tsv (
file.path (idxstats_exogenousrna_dir, sprintf ("%s.bam.idxstats" , sample)),
col_names = c (
"sequence_name" , "sequence_length" ,
"mapped_reads" , "unmapped_reads"
),
col_types = "ciii"
)
exogenous_rna_mapped_reads <- exogenous_rna_stats %>%
dplyr:: filter (! sequence_name %in% c ("*" )) %>%
dplyr:: select (sequence_name, mapped_reads) %>%
dplyr:: mutate (sample = sample)
# Read `idxstats` for human mapped reads
human_stats <- read_tsv (
file.path (idxstats_human_dir, sprintf ("%s.bam.idxstats" , sample)),
col_names = c (
"sequence_name" , "sequence_length" ,
"mapped_reads" , "unmapped_reads"
),
col_types = "ciii"
)
grch38_mapped_reads <- human_stats %>%
dplyr:: filter (! sequence_name %in% c ("*" )) %>%
dplyr:: select (mapped_reads) %>%
sum ()
grch38_mapped_reads <- tibble (
sequence_name = "grch38_mapped_reads" ,
mapped_reads = grch38_mapped_reads,
sample = sample
)
# Read bowtie2 logs for unmapped reads
bowtie2_log <- readLines (
file.path (bowtie2_human_logs, sprintf ("%s.log" , sample))
)
total_pairs <- strtoi (str_split (bowtie2_log[1 ], " " )[[1 ]][1 ])
total_reads <- total_pairs * 2
unmapped_reads <- tibble (
sequence_name = "unmapped" ,
mapped_reads = total_reads - grch38_mapped_reads$ mapped_reads,
sample = sample
)
# Consolidate counts for rows
idxstats <- rbind (
idxstats,
exogenous_rna_mapped_reads,
grch38_mapped_reads,
unmapped_reads
)
}
idxstats
Sample Correlation
BAM Correlation
Divide the human genome into bins, determine the counts in each of those bins and calculate the correlation between samples.
Note: May need to remove zero count bins
Correlation of gene counts
Using featureCounts, we generated fragment counts for “exon” features in the Ensemble hg38 annotation. Primary alignments were counted, even if the fragements were mulitmappers.
Code
featureCounts <- readr:: read_tsv (
"results/alignments/Homo_sapiens.GRCh38.dna.primary_assembly/featureCounts/all_counts.featureCounts" ,
comment = "#" ,
col_types = list (
Geneid = col_character (),
Chr = col_character (),
Start = col_character (),
End = col_character (),
Strand = col_character (),
.default = col_integer ()
)
) %>%
rename_all (~ stringr:: str_replace_all (
., "results/alignments/Homo_sapiens.GRCh38.dna.primary_assembly/sorted/" , ""
)) %>%
rename_all (~ stringr:: str_replace_all (., ".bam" , "" ))
featureCounts
Code
dds <- DESeqDataSetFromMatrix (
countData = featureCounts %>%
tibble:: column_to_rownames ("Geneid" ) %>%
dplyr:: select (! Chr: Length),
colData = sample_units,
design = ~ exogenous_rna + day + cell_line
)
dds
class: DESeqDataSet
dim: 62703 24
metadata(1): version
assays(1): counts
rownames(62703): ENSG00000160072 ENSG00000279928 ... ENSG00000277475
ENSG00000275405
rowData names(0):
colnames(24): Parental_mastermix1_day1_rep1
Parental_mastermix1_day1_rep2 ... P1E10_sorted_mastermix2_day2_rep2
P1E10_sorted_mastermix2_day2_rep3
colData names(8): sample_unit sample_name ... fq1 fq2
Heatmap of sample-to-sample distances
Code
sampleDists <- dist (t (assay (vsd)))
sampleDistMatrix <- as.matrix (sampleDists)
rownames (sampleDistMatrix) <- paste (vsd$ cell_line,
vsd$ exogenous_rna,
paste0 ("day" , vsd$ day),
sep = "-"
)
colnames (sampleDistMatrix) <- NULL
colors <- colorRampPalette (rev (brewer.pal (9 , "Blues" )))(255 )
pheatmap (sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors
)
Differetial Expression
Default DESeq2 size factors
Code
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Code
res <- results (dds, alpha = 0.05 )
summary (res)
out of 44532 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 737, 1.7%
LFC < 0 (down) : 1134, 2.5%
outliers [1] : 25, 0.056%
low counts [2] : 27026, 61%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Code
resLFC <- lfcShrink (dds, coef= "cell_line_P1E10_vs_Parental" , type= "apeglm" )
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
Code
plotMA (res, ylim= c (- 5 ,5 ))
Code
plotMA (resLFC, ylim= c (- 5 ,5 ))
Code
plotCounts (dds, gene= which.min (res$ padj), intgroup= "cell_line" )
Reads mapped to hg38 size factors
Code
hg38_size_factors <- idxstats %>%
dplyr:: filter (sequence_name == "grch38_mapped_reads" ) %>%
dplyr:: select (sample, mapped_reads) %>%
deframe ()
sizeFactors (dds) <- (hg38_size_factors / median (hg38_size_factors))
dds
class: DESeqDataSet
dim: 62703 24
metadata(1): version
assays(4): counts mu H cooks
rownames(62703): ENSG00000160072 ENSG00000279928 ... ENSG00000277475
ENSG00000275405
rowData names(30): baseMean baseVar ... deviance maxCooks
colnames(24): Parental_mastermix1_day1_rep1
Parental_mastermix1_day1_rep2 ... P1E10_sorted_mastermix2_day2_rep2
P1E10_sorted_mastermix2_day2_rep3
colData names(9): sample_unit sample_name ... fq2 sizeFactor
Code
using pre-existing size factors
found already estimated dispersions, replacing these
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Code
res <- results (dds, alpha = 0.05 )
summary (res)
out of 44532 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 777, 1.7%
LFC < 0 (down) : 1066, 2.4%
outliers [1] : 23, 0.052%
low counts [2] : 26182, 59%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Code
resLFC <- lfcShrink (dds, coef= "cell_line_P1E10_vs_Parental" , type= "apeglm" )
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
Code
plotMA (res, ylim= c (- 5 ,5 ))
Code
plotMA (resLFC, ylim= c (- 5 ,5 ))
Code
plotCounts (dds, gene= which.min (res$ padj), intgroup= "cell_line" )
Gene Biotype
For each gene_id in the featureCounts table, we look up the gene_biotype from Ensembl.
Code
hg38 <- EnsDb.Hsapiens.v86
hg38_genes <- genes (hg38, return.type = "data.frame" )
counts_by_biotype <- featureCounts %>%
full_join (hg38_genes %>% dplyr:: select (gene_id, gene_biotype),
by = c ("Geneid" = "gene_id" )
) %>%
group_by (gene_biotype) %>%
dplyr:: select ("gene_biotype" , ! c (Geneid, Chr, Start, End, Strand, Length)) %>%
summarise_all (list (count = sum)) %>%
tidyr:: pivot_longer (! gene_biotype, names_to = "sample" , values_to = "count" )
counts_by_biotype
Code
p <- ggplot (data = subset (counts_by_biotype, ! is.na (count)), aes (x = sample, y = count, fill = gene_biotype)) +
geom_bar (stat = "identity" , position = "fill" ) +
theme (axis.text.x = element_text (angle = 90 , vjust = 0.5 , hjust = 1 ))
p